Linear train
split_ratio <- 0.7
set.seed(168)
split_index <- createDataPartition(df_data$cases_new, p=split_ratio, list=FALSE)
data_train <- df_data[split_index,]
data_test <- df_data[-split_index,]
linear_model <- lm(cases_new~cases_active,data=data_train)
summary(linear_model)
##
## Call:
## lm(formula = cases_new ~ cases_active, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3742.5 -215.4 -127.7 221.8 4264.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286e+02 5.175e+01 2.485 0.0133 *
## cases_active 8.244e-02 6.365e-04 129.506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 948.8 on 494 degrees of freedom
## Multiple R-squared: 0.9714, Adjusted R-squared: 0.9713
## F-statistic: 1.677e+04 on 1 and 494 DF, p-value: < 2.2e-16




linear_prediction <- linear_model %>% predict(data_test)
linear_compare <- data.frame(actual=data_test$cases_new, predicted=linear_prediction)
head(linear_compare)
## actual predicted
## 1 4 128.9024
## 5 3 129.1497
## 8 0 129.2322
## 10 0 129.2322
## 14 1 129.7268
## 16 1 129.7268
linear_performance <- data.frame(
MODEL = "Gaussian Linear",
R2 = R2(linear_prediction, data_test$cases_new),
RMSE = RMSE(linear_prediction, data_test$cases_new),
MAE = MAE(linear_prediction, data_test$cases_new)
)
linear_performance
## MODEL R2 RMSE MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=linear_prediction)
df_actual <- data_test
df_train <- data_train
lm_chart <- plot_ly()
# Predicted Data
lm_chart <- lm_chart %>%
add_trace(
x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
name = "Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
lm_chart <- lm_chart %>%
add_trace(
x = df_actual[["date"]], y = df_actual[["cases_new"]],
name = "Actual Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
lm_chart <- lm_chart %>%
add_trace(
x = df_train[["date"]], y = df_train[["cases_new"]],
name = "Train Data",
type = "scatter",
mode = "lines",
line = list(color = 'green', width = 2)
)
# Set figure title, x and y-axes titles
lm_chart <- lm_chart %>% layout(
title = "Linear Regression of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
lm_chart
ARIMA Training
arima_model <- auto.arima(df_data$cases_active, trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)
##
## ARIMA(0,1,0) : 12574.82
## ARIMA(0,1,0) with drift : 12576.06
## ARIMA(0,1,1) : 12201.06
## ARIMA(0,1,1) with drift : 12202.55
## ARIMA(0,1,2) : 12000.65
## ARIMA(0,1,2) with drift : 12002.3
## ARIMA(0,1,3) : 11949.82
## ARIMA(0,1,3) with drift : 11951.54
## ARIMA(0,1,4) : 11892.69
## ARIMA(0,1,4) with drift : 11894.47
## ARIMA(0,1,5) : 11833.57
## ARIMA(0,1,5) with drift : 11835.4
## ARIMA(1,1,0) : 11789.02
## ARIMA(1,1,0) with drift : 11790.96
## ARIMA(1,1,1) : 11637.72
## ARIMA(1,1,1) with drift : 11639.74
## ARIMA(1,1,2) : 11636.15
## ARIMA(1,1,2) with drift : 11638.18
## ARIMA(1,1,3) : 11637.46
## ARIMA(1,1,3) with drift : 11639.49
## ARIMA(1,1,4) : 11612.64
## ARIMA(1,1,4) with drift : 11614.67
## ARIMA(2,1,0) : 11710.42
## ARIMA(2,1,0) with drift : 11712.4
## ARIMA(2,1,1) : 11636.58
## ARIMA(2,1,1) with drift : 11638.61
## ARIMA(2,1,2) : 11638.07
## ARIMA(2,1,2) with drift : 11640.11
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : 11640.24
## ARIMA(3,1,0) : 11684.41
## ARIMA(3,1,0) with drift : 11686.42
## ARIMA(3,1,1) : 11636.13
## ARIMA(3,1,1) with drift : 11638.16
## ARIMA(3,1,2) : Inf
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 11635.54
## ARIMA(4,1,0) with drift : 11637.56
## ARIMA(4,1,1) : 11623.02
## ARIMA(4,1,1) with drift : 11625.06
## ARIMA(5,1,0) : 11626.8
## ARIMA(5,1,0) with drift : 11628.84
##
##
##
## Best model: ARIMA(1,1,4)
## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
forecast_length <- 30
arima_predict <- forecast(arima_model, forecast_length)
head(arima_predict)
## $method
## [1] "ARIMA(1,1,4)"
##
## $model
## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
##
## $level
## [1] 80 95
##
## $mean
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## [1] 40614.16 40307.63 40048.57 39882.01 39719.83 39561.92 39408.15 39258.42
## [9] 39112.62 38970.65 38832.42 38697.81 38566.74 38439.11 38314.84 38193.83
## [17] 38076.00 37961.26 37849.54 37740.75 37634.82 37531.67 37431.23 37333.44
## [25] 37238.21 37145.48 37055.18 36967.26 36881.65 36798.29
##
## $lower
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 39478.114 38876.7256
## 710 38391.848 37377.6936
## 711 37403.746 36003.6634
## 712 36566.876 34811.9487
## 713 35617.312 33445.5671
## 714 34581.553 31945.1080
## 715 33477.514 30338.0259
## 716 32317.611 28643.3705
## 717 31110.863 26874.9883
## 718 29864.124 25043.4190
## 719 28582.818 23157.0088
## 720 27271.371 21222.5802
## 721 25933.490 19245.8519
## 722 24572.339 17231.7126
## 723 23190.662 15184.4066
## 724 21790.866 13107.6638
## 725 20375.089 11004.7946
## 726 18945.235 8878.7600
## 727 17503.021 6732.2261
## 728 16049.997 4567.6059
## 729 14587.570 2387.0935
## 730 13117.025 192.6909
## 731 11639.533 -2013.7690
## 732 10156.171 -4230.6039
## 733 8667.926 -6456.2660
## 734 7175.708 -8689.3286
## 735 5680.356 -10928.4742
## 736 4182.644 -13172.4849
## 737 2683.288 -15420.2328
## 738 1182.949 -17670.6729
##
## $upper
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 41750.21 42351.60
## 710 42223.42 43237.57
## 711 42693.38 44093.47
## 712 43197.15 44952.08
## 713 43822.36 45994.10
## 714 44542.28 47178.73
## 715 45338.78 48478.27
## 716 46199.22 49873.46
## 717 47114.38 51350.25
## 718 48077.18 52897.89
## 719 49082.01 54507.82
## 720 50124.25 56173.04
## 721 51199.99 57887.63
## 722 52305.88 59646.51
## 723 53439.01 61445.27
## 724 54596.79 63279.99
## 725 55776.90 65147.20
## 726 56977.28 67043.76
## 727 58196.05 68966.85
## 728 59431.50 70913.89
## 729 60682.07 72882.55
## 730 61946.32 74870.65
## 731 63222.94 76876.24
## 732 64510.70 78897.48
## 733 65808.48 80932.68
## 734 67115.25 82980.28
## 735 68430.01 85038.84
## 736 69751.88 87107.01
## 737 71080.02 89183.54
## 738 72413.63 91267.25
plot(arima_predict, main = "Predicted Active Cases", col.main = "black")

last_date <- as.Date(df_data[(nrow(df_data)):nrow(df_data),1], format="%Y-%m-%d")
last_date <- last_date + 1
df_arima <- data.frame(
date=seq(last_date, by = "day", length.out = forecast_length),
cases_active=arima_predict$mean
)
combined_prediction <- linear_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)
df_data$month <- strftime(df_data$date, "%m")
df_data$year <- strftime(df_data$date, "%Y")
df_smooth <- df_data %>%
group_by(date=lubridate::floor_date(df_data$date, "month")) %>%
dplyr::summarize(cases_new = mean(cases_new)) %>%
data.frame
combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>%
add_trace(
x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
name = "Future Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
combined_chart <- combined_chart %>%
add_trace(
x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
name = "Actual Data (Rolled to Monthly)",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
title = "Prediction of Daily New Cases (Gaussian)",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
combined_chart
Fiddling with Poisson
poisson_model <- glm(cases_new~cases_active, data=data_train, family=poisson(link="log"))
summary(poisson_model)
##
## Call:
## glm(formula = cases_new ~ cases_active, family = poisson(link = "log"),
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -69.27 -50.04 -12.42 23.54 96.57
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.264e+00 1.223e-03 5938 <2e-16 ***
## cases_active 1.167e-05 7.085e-09 1647 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3228235 on 495 degrees of freedom
## Residual deviance: 805402 on 494 degrees of freedom
## AIC: 809428
##
## Number of Fisher Scoring iterations: 5
# plot(gamma_model)
poisson_prediction <- poisson_model %>% predict(data_test)
poisson_compare <- data.frame(actual=data_test$cases_new, predicted=poisson_prediction)
head(poisson_compare)
## actual predicted
## 1 4 7.264387
## 5 3 7.264422
## 8 0 7.264434
## 10 0 7.264434
## 14 1 7.264504
## 16 1 7.264504
poisson_performance <- data.frame(
MODEL = "Poisson GLM",
R2 = R2(poisson_prediction, data_test$cases_new),
RMSE = RMSE(poisson_prediction, data_test$cases_new),
MAE = MAE(poisson_prediction, data_test$cases_new)
)
poisson_performance
## MODEL R2 RMSE MAE
## 1 Poisson GLM 0.9721543 6606.18 3816.115
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=poisson_prediction)
poisson_chart <- plot_ly()
# Predicted Data
poisson_chart <- poisson_chart %>%
add_trace(
x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
name = "Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
poisson_chart <- poisson_chart %>%
add_trace(
x = df_actual[["date"]], y = df_actual[["cases_new"]],
name = "Actual Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
poisson_chart <- poisson_chart %>%
add_trace(
x = df_train[["date"]], y = df_train[["cases_new"]],
name = "Train Data",
type = "scatter",
mode = "lines",
line = list(color = 'green', width = 1)
)
# Set figure title, x and y-axes titles
poisson_chart <- poisson_chart %>% layout(
title = "Poisson Regression of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
poisson_chart
It seems like the Poisson model is really bad just by the sight of looking at the graph. Therefore do not relies too much on statistical benchmark such as R2 as the source of truth. The easiest way to determine on hindsight is to visualize, use them!
Poisson Prediction
combined_prediction <- poisson_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)
combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>%
add_trace(
x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
name = "Future Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
combined_chart <- combined_chart %>%
add_trace(
x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
name = "Actual Data (Rolled to Monthly)",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
title = "Prediction of Daily New Cases (Poisson)",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
combined_chart
## MODEL R2 RMSE MAE
## 1 Poisson GLM 0.9721543 6606.18 3816.115
## MODEL R2 RMSE MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
combined_performance <- rbind(linear_performance, poisson_performance)
combined_performance
## MODEL R2 RMSE MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
## 2 Poisson GLM 0.9721543 6606.1804 3816.1151
End of document